Load libraries
library(msa)
library(ape)
library(seqinr)
library(ggtree)
library(tidyverse)
List all dereplicate fasta files
directory <- "./derep_fasta"
files <- list.files(path=directory, pattern=".fasta")
for (i in files){
print(paste0(directory, "/", i))
}
## [1] "./derep_fasta/derep_anisakidae_18s.fasta"
## [1] "./derep_fasta/derep_anisakidae_coi.fasta"
## [1] "./derep_fasta/derep_anisakidae_coi5p.fasta"
## [1] "./derep_fasta/derep_anisakidae_coii.aligned.fasta"
## [1] "./derep_fasta/derep_anisakidae_coii.fasta"
## [1] "./derep_fasta/derep_anisakidae_its.fasta"
Dereplicate anisakidae 18S
Dereplicate anisakidae COX1
# read in fasta file
fcontent <- readLines("dedup_fasta/derep_anisakidae_coi5p.fasta")
seqStart <- grep(">", fcontent)
seqEnd <- seqStart - 1
seqEnd <- seqEnd[-1]
seqEnd <- c(seqEnd, length(fcontent))
mSeqPos <- cbind(seqStart, seqEnd)
rownames(mSeqPos) <- fcontent[seqStart]
mSeqPos[, 1] <- mSeqPos[, 1] + 1
mSeqPos <- cbind(mSeqPos, sequence=NA)
for (i in 1:nrow(mSeqPos)){
sequ <- ""
for (j in mSeqPos[i, "seqStart"]:mSeqPos[i, "seqEnd"]){
sequ <- paste0(sequ, fcontent[j])
}
mSeqPos[i, "sequence"] <- sequ
}
seqDb <- mSeqPos %>%
as.data.frame() %>%
rownames_to_column('id') %>%
filter(grepl('Anisakis|Pseudoterranova|Phocascaris|Contracaecum', id)) %>%
mutate(id=gsub('cf. ', '', id)) %>%
mutate(id=gsub('aff. ', '', id)) %>%
mutate(id=paste(gsub('>', '', word(id, 1)), word(id, 2), word(id, 3))) %>%
mutate(genus=word(id, 2)) %>%
mutate(species=paste(word(id, 2), word(id,3))) %>%
select(id, genus, species, sequence) %>%
column_to_rownames('id')
# Step 1: take the DNA sequence
DNASEQ <- seqDb[, 'sequence']
names(DNASEQ) <- rownames(seqDb)
# Step 2: convert to DNAStringSet
DNASS <- DNAStringSet(DNASEQ)
# or directly read in fasta using readDNAStringSet
# DNASS <- readDNAStringSet("fasta/subseq_anisakidae_18s.fasta")
# DNASS <- readDNAStringSet("fasta/subseq_anisakidae_coi5p.fasta")
# DNASS <- readDNAStringSet("dedup_fasta/derep_anisakidae_its.fasta")
# Step 3: multiple sequence alignment
alignment <- msa(DNASS, 'ClustalOmega')
# Step 4: clustering and distances
alignment <- msaConvert(alignment, "seqinr::alignment")
distMatrix <- dist.alignment(alignment, 'similarity')
clustering <- hclust(distMatrix)
# Step 5: APE to make cladogram
phylotree <- as.phylo(clustering)
# plot(phylotree, type="cladogram")
ggtree(phylotree) +
geom_tiplab(size=2.5) +
ggexpand(ratio=1.2, side='h')
Dereplicate anisakidae COX2
(same as abovementioned)
Dereplicate anisakidae ITS
(same as abovementioned)
Dereplicate anisakidae 18S (ClustalW)
# read in fasta file
fcontent <- readLines("derep_fasta/derep_anisakidae_18s.fasta")
seqStart <- grep(">", fcontent)
seqEnd <- seqStart - 1
seqEnd <- seqEnd[-1]
seqEnd <- c(seqEnd, length(fcontent))
mSeqPos <- cbind(seqStart, seqEnd)
rownames(mSeqPos) <- fcontent[seqStart]
mSeqPos[, 1] <- mSeqPos[, 1] + 1
mSeqPos <- cbind(mSeqPos, sequence=NA)
for (i in 1:nrow(mSeqPos)){
sequ <- ""
for (j in mSeqPos[i, "seqStart"]:mSeqPos[i, "seqEnd"]){
sequ <- paste0(sequ, fcontent[j])
}
mSeqPos[i, "sequence"] <- sequ
}
seqDb <- mSeqPos %>%
as.data.frame() %>%
rownames_to_column('id') %>%
#filter(grepl('Anisakis|Pseudoterranova|Phocascaris|Contracaecum', id)) %>%
mutate(id=gsub('cf. ', '', id)) %>%
mutate(id=gsub('aff. ', '', id)) %>%
mutate(tmp=paste(gsub('>', '', word(id, 1)), word(id, 2), word(id, 3))) %>%
mutate(id=gsub('>', '', word(tmp, 1))) %>%
mutate(genus=word(tmp, 2)) %>%
mutate(species=paste(word(tmp, 2), word(tmp,3))) %>%
select(id, genus, species, sequence)
# perform MSA using ClustalW CLI: ./run_clustalw.sh derep_fasta.fa
# import alignment (in clustal format)
alignment <- read.alignment(file="derep_fasta/derep_anisakidae_18s.aln", format="clustal")
# calculate distance matrix for MSA
distMatrix <- dist.alignment(alignment, "similarity")
# Construct a phylo tree using neighbor joining algorithm
mytree <- nj(distMatrix) %>%
phytools::midpoint.root()
tipLabelDf <- data.frame(mytree$tip.label)
colnames(tipLabelDf) <- "id"
tipLabelDf <- full_join(tipLabelDf, seqDb, by="id") %>%
mutate(id=paste(id, species))
rownames(tipLabelDf) <- tipLabelDf[["id"]]
mytree$tip.label <- tipLabelDf[["id"]]
p <- ggtree(mytree, options(ignore.negative.edge=TRUE)) #+
# geom_tiplab(size=2.5) +
# ggexpand(ratio=0.5, side="h")
p %<+% tipLabelDf +
geom_tiplab(size=2.5) +
theme_tree() +
geom_tippoint(aes(color=genus), size=2) +
ggexpand(ratio=0.4, side="h")

Dereplicate anisakidae COX1-5P (ClustalW)
# read in fasta file
fcontent <- readLines("derep_fasta/derep_anisakidae_coi5p.fasta")
seqStart <- grep(">", fcontent)
seqEnd <- seqStart - 1
seqEnd <- seqEnd[-1]
seqEnd <- c(seqEnd, length(fcontent))
mSeqPos <- cbind(seqStart, seqEnd)
rownames(mSeqPos) <- fcontent[seqStart]
mSeqPos[, 1] <- mSeqPos[, 1] + 1
mSeqPos <- cbind(mSeqPos, sequence=NA)
for (i in 1:nrow(mSeqPos)){
sequ <- ""
for (j in mSeqPos[i, "seqStart"]:mSeqPos[i, "seqEnd"]){
sequ <- paste0(sequ, fcontent[j])
}
mSeqPos[i, "sequence"] <- sequ
}
seqDb <- mSeqPos %>%
as.data.frame() %>%
rownames_to_column('id') %>%
#filter(grepl('Anisakis|Pseudoterranova|Phocascaris|Contracaecum', id)) %>%
mutate(id=gsub('cf. ', '', id)) %>%
mutate(id=gsub('aff. ', '', id)) %>%
mutate(tmp=paste(gsub('>', '', word(id, 1)), word(id, 2), word(id, 3))) %>%
mutate(id=gsub('>', '', word(tmp, 1))) %>%
mutate(genus=word(tmp, 2)) %>%
mutate(species=paste(word(tmp, 2), word(tmp,3))) %>%
select(id, genus, species, sequence)
# perform MSA using ClustalW CLI: ./run_clustalw.sh derep_fasta.fa
# import alignment (in clustal format)
alignment <- read.alignment(file="derep_fasta/derep_anisakidae_coi5p.aln", format="clustal")
# calculate distance matrix for MSA
distMatrix <- dist.alignment(alignment, "similarity")
# Construct a phylo tree using neighbor joining algorithm
mytree <- nj(distMatrix) %>%
phytools::midpoint.root()
tipLabelDf <- data.frame(mytree$tip.label)
colnames(tipLabelDf) <- "id"
tipLabelDf <- full_join(tipLabelDf, seqDb, by="id") %>%
mutate(id=paste(id, species))
rownames(tipLabelDf) <- tipLabelDf[["id"]]
mytree$tip.label <- tipLabelDf[["id"]]
p <- ggtree(mytree, options(ignore.negative.edge=TRUE)) #+
# geom_tiplab(size=2.5) +
# ggexpand(ratio=0.5, side="h")
p %<+% tipLabelDf +
geom_tiplab(size=2.5) +
theme_tree() +
geom_tippoint(aes(color=genus), size=2) +
ggexpand(ratio=0.2, side="h")

Dereplicate anisakidae COX2 (ClustalW)
# read in fasta file
fcontent <- readLines("derep_fasta/derep_anisakidae_coii.fasta")
seqStart <- grep(">", fcontent)
seqEnd <- seqStart - 1
seqEnd <- seqEnd[-1]
seqEnd <- c(seqEnd, length(fcontent))
mSeqPos <- cbind(seqStart, seqEnd)
rownames(mSeqPos) <- fcontent[seqStart]
mSeqPos[, 1] <- mSeqPos[, 1] + 1
mSeqPos <- cbind(mSeqPos, sequence=NA)
for (i in 1:nrow(mSeqPos)){
sequ <- ""
for (j in mSeqPos[i, "seqStart"]:mSeqPos[i, "seqEnd"]){
sequ <- paste0(sequ, fcontent[j])
}
mSeqPos[i, "sequence"] <- sequ
}
seqDb <- mSeqPos %>%
as.data.frame() %>%
rownames_to_column('id') %>%
#filter(grepl('Anisakis|Pseudoterranova|Phocascaris|Contracaecum', id)) %>%
mutate(id=gsub('cf. ', '', id)) %>%
mutate(id=gsub('aff. ', '', id)) %>%
mutate(tmp=paste(gsub('>', '', word(id, 1)), word(id, 2), word(id, 3))) %>%
mutate(id=gsub('>', '', word(tmp, 1))) %>%
mutate(genus=word(tmp, 2)) %>%
mutate(species=paste(word(tmp, 2), word(tmp,3))) %>%
select(id, genus, species, sequence)
# perform MSA using ClustalW CLI: ./run_clustalw.sh derep_fasta.fa
# import alignment (in clustal format)
alignment <- read.alignment(file="derep_fasta/derep_anisakidae_coii.aln", format="clustal")
# calculate distance matrix for MSA
distMatrix <- dist.alignment(alignment, "similarity")
# Construct a phylo tree using neighbor joining algorithm
mytree <- nj(distMatrix) %>%
phytools::midpoint.root()
tipLabelDf <- data.frame(mytree$tip.label)
colnames(tipLabelDf) <- "id"
tipLabelDf <- full_join(tipLabelDf, seqDb, by="id") %>%
mutate(id=paste(id, species))
rownames(tipLabelDf) <- tipLabelDf[["id"]]
mytree$tip.label <- tipLabelDf[["id"]]
p <- ggtree(mytree, options(ignore.negative.edge=TRUE)) #+
# geom_tiplab(size=2.5) +
# ggexpand(ratio=0.5, side="h")
p %<+% tipLabelDf +
geom_tiplab(size=1.2) +
theme_tree() +
geom_tippoint(aes(color=genus), size=2) +
ggexpand(ratio=0.2, side="h")

Dereplicate anisakidae ITS (ClustalW)
# read in fasta file
fcontent <- readLines("derep_fasta/derep_anisakidae_its.fasta")
seqStart <- grep(">", fcontent)
seqEnd <- seqStart - 1
seqEnd <- seqEnd[-1]
seqEnd <- c(seqEnd, length(fcontent))
mSeqPos <- cbind(seqStart, seqEnd)
rownames(mSeqPos) <- fcontent[seqStart]
mSeqPos[, 1] <- mSeqPos[, 1] + 1
mSeqPos <- cbind(mSeqPos, sequence=NA)
for (i in 1:nrow(mSeqPos)){
sequ <- ""
for (j in mSeqPos[i, "seqStart"]:mSeqPos[i, "seqEnd"]){
sequ <- paste0(sequ, fcontent[j])
}
mSeqPos[i, "sequence"] <- sequ
}
seqDb <- mSeqPos %>%
as.data.frame() %>%
rownames_to_column('id') %>%
#filter(grepl('Anisakis|Pseudoterranova|Phocascaris|Contracaecum', id)) %>%
mutate(id=gsub('cf. ', '', id)) %>%
mutate(id=gsub('aff. ', '', id)) %>%
mutate(tmp=paste(gsub('>', '', word(id, 1)), word(id, 2), word(id, 3))) %>%
mutate(id=gsub('>', '', word(tmp, 1))) %>%
mutate(genus=word(tmp, 2)) %>%
mutate(species=paste(word(tmp, 2), word(tmp,3))) %>%
select(id, genus, species, sequence)
# perform MSA using ClustalW CLI: ./run_clustalw.sh derep_fasta.fa
# import alignment (in clustal format)
alignment <- read.alignment(file="derep_fasta/derep_anisakidae_its.aln", format="clustal")
# calculate distance matrix for MSA
distMatrix <- dist.alignment(alignment, "similarity")
# Construct a phylo tree using neighbor joining algorithm
mytree <- nj(distMatrix) %>%
phytools::midpoint.root()
tipLabelDf <- data.frame(mytree$tip.label)
colnames(tipLabelDf) <- "id"
tipLabelDf <- full_join(tipLabelDf, seqDb, by="id") %>%
mutate(id=paste(id, species))
rownames(tipLabelDf) <- tipLabelDf[["id"]]
mytree$tip.label <- tipLabelDf[["id"]]
p <- ggtree(mytree, options(ignore.negative.edge=TRUE)) #+
# geom_tiplab(size=2.5) +
# ggexpand(ratio=0.5, side="h")
p %<+% tipLabelDf +
geom_tiplab(size=2.5) +
theme_tree() +
geom_tippoint(aes(color=genus), size=2) +
ggexpand(ratio=0.2, side="h")

Dereplicate anisakidae COX2 (Muscle)
# read in fasta file
fcontent <- readLines("derep_fasta/derep_anisakidae_coii.fasta")
seqStart <- grep(">", fcontent)
seqEnd <- seqStart - 1
seqEnd <- seqEnd[-1]
seqEnd <- c(seqEnd, length(fcontent))
mSeqPos <- cbind(seqStart, seqEnd)
rownames(mSeqPos) <- fcontent[seqStart]
mSeqPos[, 1] <- mSeqPos[, 1] + 1
mSeqPos <- cbind(mSeqPos, sequence=NA)
for (i in 1:nrow(mSeqPos)){
sequ <- ""
for (j in mSeqPos[i, "seqStart"]:mSeqPos[i, "seqEnd"]){
sequ <- paste0(sequ, fcontent[j])
}
mSeqPos[i, "sequence"] <- sequ
}
seqDb <- mSeqPos %>%
as.data.frame() %>%
rownames_to_column('id') %>%
#filter(grepl('Anisakis|Pseudoterranova|Phocascaris|Contracaecum', id)) %>%
mutate(id=gsub('cf. ', '', id)) %>%
mutate(id=gsub('aff. ', '', id)) %>%
mutate(tmp=paste(gsub('>', '', word(id, 1)), word(id, 2), word(id, 3))) %>%
mutate(id=gsub('>', '', word(tmp, 1))) %>%
mutate(genus=word(tmp, 2)) %>%
mutate(species=paste(word(tmp, 2), word(tmp,3))) %>%
select(id, genus, species, sequence)
# perform MSA using ClustalW CLI: ./run_clustalw.sh derep_fasta.fa
# import alignment (in clustal format)
alignment <- read.alignment(file="derep_fasta/derep_anisakidae_coii.aligned.fasta", format="fasta")
# calculate distance matrix for MSA
distMatrix <- dist.alignment(alignment, "similarity")
# Construct a phylo tree using neighbor joining algorithm
mytree <- nj(distMatrix) %>%
phytools::midpoint.root()
tipLabelDf <- data.frame(mytree$tip.label)
colnames(tipLabelDf) <- "id"
tipLabelDf <- full_join(tipLabelDf, seqDb, by="id") %>%
mutate(id=paste(id, species))
rownames(tipLabelDf) <- tipLabelDf[["id"]]
mytree$tip.label <- tipLabelDf[["id"]]
p <- ggtree(mytree, options(ignore.negative.edge=TRUE)) #+
# geom_tiplab(size=2.5) +
# ggexpand(ratio=0.5, side="h")
p %<+% tipLabelDf +
geom_tiplab(size=1.2) +
theme_tree() +
geom_tippoint(aes(color=genus), size=2) +
ggexpand(ratio=0.2, side="h")
